From c03153348098fbdd905163af95ecbabbd43f31bb Mon Sep 17 00:00:00 2001 From: Yang Hongyang Date: Fri, 18 Jul 2014 17:14:22 +0800 Subject: [PATCH] libxl/remus: setup and control disk replication for DRBD backends This patch adds the machinery required for protecting a guest's disk state, when the guest disk uses a DRBD disk backend. This patch comprises two parts: 1. Hotplug scripts: The block-drbd-probe script is responsible for performing sanity checks on the state of the DRBD disk before the checkpointing process begins. This script should be invoked by libxl for each of the guest's disk devices, when starting Remus. 2. Remus drbd disk device: Implements the interfaces required by the remus abstract device layer. A note about the implementation: a) setup() is called for each disk attached to the guest. During setup(): i) The hotplug script is called to perform the sanity check. ii) Libxl obtains a handle to the DRBD device (/dev/drbd*) and subsequently controls disk checkpoint replication using this handle in the checkpoint callbacks. b) The preresume() checkpoint callback is executed asynchronously using libxl__ev_child_fork(), as it may potentially block for more than a few seconds in case of backup failure. 
Signed-off-by: Lai Jiangshan Signed-off-by: Wen Congyang Signed-off-by: Yang Hongyang Signed-off-by: Ian Jackson Signed-off-by: Shriram Rajagopalan Acked-by: Ian Jackson Acked-by: Konrad Rzeszutek Wilk --- docs/README.remus | 10 ++ tools/hotplug/Linux/Makefile | 1 + tools/hotplug/Linux/block-drbd-probe | 87 +++++++++ tools/libxl/Makefile | 2 +- tools/libxl/libxl.c | 1 + tools/libxl/libxl_internal.h | 5 + tools/libxl/libxl_remus_device.c | 7 + tools/libxl/libxl_remus_disk_drbd.c | 258 +++++++++++++++++++++++++++ 8 files changed, 370 insertions(+), 1 deletion(-) create mode 100755 tools/hotplug/Linux/block-drbd-probe create mode 100644 tools/libxl/libxl_remus_disk_drbd.c diff --git a/docs/README.remus b/docs/README.remus index ddf5b5558f..20783c93da 100644 --- a/docs/README.remus +++ b/docs/README.remus @@ -8,3 +8,13 @@ Using Remus with libxl on Xen 4.5 and higher: or higher along with the development headers and command line utilities. If your distro does not have the appropriate libnl3 version, you can find the latest source tarball of libnl3 at http://www.carisma.slowglass.com/~tgr/libnl/ + +Disk replication: + VMs protected by Remus need to use DRBD based disk backends. Specifically, you + need to compile and install a custom version of DRBD, which is publicly available + at https://github.com/rshriram/remus-drbd + This code is based on DRBD 8.3.11 and uses a new replication protocol (named + protocol D) for asynchronous disk checkpoint replication. A protected VM's DRBD + disks on the primary and backup hosts need to be configured to use protocol D + as the replication protocol. An example resource configuration file can be found + in the aforementioned github repository. 
diff --git a/tools/hotplug/Linux/Makefile b/tools/hotplug/Linux/Makefile index 31e57f76ad..5317feff26 100644 --- a/tools/hotplug/Linux/Makefile +++ b/tools/hotplug/Linux/Makefile @@ -24,6 +24,7 @@ XEN_SCRIPTS += xen-hotplug-cleanup XEN_SCRIPTS += external-device-migrate XEN_SCRIPTS += vscsi XEN_SCRIPTS += block-iscsi +XEN_SCRIPTS += block-drbd-probe XEN_SCRIPTS += $(XEN_SCRIPTS-y) SUBDIRS-$(CONFIG_SYSTEMD) += systemd diff --git a/tools/hotplug/Linux/block-drbd-probe b/tools/hotplug/Linux/block-drbd-probe new file mode 100755 index 0000000000..247a9d098d --- /dev/null +++ b/tools/hotplug/Linux/block-drbd-probe @@ -0,0 +1,87 @@ +#! /bin/bash +# +# Copyright (C) 2014 FUJITSU LIMITED +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Usage: +# block-drbd-probe devicename +# +# Return value: +# 0: the device is drbd device +# 1: the device is not drbd device +# 2: unkown error +# 3: the drbd device does not use protocol D +# 4: the drbd device is not ready + +set -e + +drbd_res= + +function get_res_name() +{ + local drbd_dev=$1 + local drbd_dev_list=($(drbdadm sh-dev all)) + local drbd_res_list=($(drbdadm sh-resource all)) + local temp_drbd_dev temp_drbd_res + local found=0 + + for temp_drbd_dev in ${drbd_dev_list[@]}; do + if [[ "$temp_drbd_dev" == "$drbd_dev" ]]; then + found=1 + break + fi + done + + if [[ $found -eq 0 ]]; then + return 1 + fi + + for temp_drbd_res in ${drbd_res_list[@]}; do + temp_drbd_dev=$(drbdadm sh-dev $temp_drbd_res) + if [[ "$temp_drbd_dev" == "$drbd_dev" ]]; then + drbd_res="$temp_drbd_res" + return 0 + fi + done + + # OOPS + return 2 +} + +get_res_name $1 +rc=$? +if [[ $rc -ne 0 ]]; then + exit $rc +fi + +# check protocol +drbdsetup $1 show | grep -q "protocol D;" +if [[ $? 
-ne 0 ]]; then + exit 3 +fi + +# check connect status +state=$(drbdadm cstate "$drbd_res") +if [[ "$state" != "Connected" ]]; then + exit 4 +fi + +# check role +role=$(drbdadm role "$drbd_res") +if [[ "$role" != "Primary/Secondary" ]]; then + exit 4 +fi + +exit 0 diff --git a/tools/libxl/Makefile b/tools/libxl/Makefile index da3cddb1e3..a6c3b0eb52 100644 --- a/tools/libxl/Makefile +++ b/tools/libxl/Makefile @@ -56,7 +56,7 @@ else LIBXL_OBJS-y += libxl_nonetbuffer.o endif -LIBXL_OBJS-y += libxl_remus_device.o +LIBXL_OBJS-y += libxl_remus_device.o libxl_remus_disk_drbd.o LIBXL_OBJS-$(CONFIG_X86) += libxl_cpuid.o libxl_x86.o LIBXL_OBJS-$(CONFIG_ARM) += libxl_nocpuid.o libxl_arm.o diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c index 27fdfc20e6..1856ae518b 100644 --- a/tools/libxl/libxl.c +++ b/tools/libxl/libxl.c @@ -825,6 +825,7 @@ int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info, goto out; } rds->device_kind_flags |= (1 << LIBXL__DEVICE_KIND_VIF); + rds->device_kind_flags |= (1 << LIBXL__DEVICE_KIND_VBD); rds->ao = ao; rds->domid = domid; diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h index 2776d19a95..b87c5e2189 100644 --- a/tools/libxl/libxl_internal.h +++ b/tools/libxl/libxl_internal.h @@ -2667,6 +2667,8 @@ struct libxl__remus_device_instance_ops { int init_subkind_nic(libxl__remus_devices_state *rds); void cleanup_subkind_nic(libxl__remus_devices_state *rds); +int init_subkind_drbd_disk(libxl__remus_devices_state *rds); +void cleanup_subkind_drbd_disk(libxl__remus_devices_state *rds); typedef void libxl__remus_callback(libxl__egc *, libxl__remus_devices_state *, int rc); @@ -2709,6 +2711,9 @@ struct libxl__remus_devices_state { char *netbufscript; struct nl_sock *nlsock; struct nl_cache *qdisc_cache; + + /* private for drbd disk subkind ops */ + char *drbd_probe_script; }; /* diff --git a/tools/libxl/libxl_remus_device.c b/tools/libxl/libxl_remus_device.c index b20168f367..a6cb7f61ca 100644 --- 
a/tools/libxl/libxl_remus_device.c +++ b/tools/libxl/libxl_remus_device.c @@ -18,8 +18,10 @@ #include "libxl_internal.h" extern const libxl__remus_device_instance_ops remus_device_nic; +extern const libxl__remus_device_instance_ops remus_device_drbd_disk; static const libxl__remus_device_instance_ops *remus_ops[] = { &remus_device_nic, + &remus_device_drbd_disk, NULL, }; @@ -36,6 +38,9 @@ static int init_device_subkind(libxl__remus_devices_state *rds) if (rc) goto out; } + rc = init_subkind_drbd_disk(rds); + if (rc) goto out; + rc = 0; out: return rc; @@ -48,6 +53,8 @@ static void cleanup_device_subkind(libxl__remus_devices_state *rds) if (libxl__netbuffer_enabled(gc)) cleanup_subkind_nic(rds); + + cleanup_subkind_drbd_disk(rds); } /*----- setup() and teardown() -----*/ diff --git a/tools/libxl/libxl_remus_disk_drbd.c b/tools/libxl/libxl_remus_disk_drbd.c new file mode 100644 index 0000000000..3215f931dc --- /dev/null +++ b/tools/libxl/libxl_remus_disk_drbd.c @@ -0,0 +1,258 @@ +/* + * Copyright (C) 2014 FUJITSU LIMITED + * Author Lai Jiangshan + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; version 2.1 only. with the special + * exception on linking described in file LICENSE. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ */ + +#include "libxl_osdeps.h" /* must come before any other headers */ + +#include "libxl_internal.h" + +/*** drbd implementation ***/ +const int DRBD_SEND_CHECKPOINT = 20; +const int DRBD_WAIT_CHECKPOINT_ACK = 30; + +typedef struct libxl__remus_drbd_disk { + int ctl_fd; + int ackwait; +} libxl__remus_drbd_disk; + +int init_subkind_drbd_disk(libxl__remus_devices_state *rds) +{ + STATE_AO_GC(rds->ao); + + rds->drbd_probe_script = GCSPRINTF("%s/block-drbd-probe", + libxl__xen_script_dir_path()); + + return 0; +} + +void cleanup_subkind_drbd_disk(libxl__remus_devices_state *rds) +{ + return; +} + +/*----- helper functions, for async calls -----*/ +static void drbd_async_call(libxl__egc *egc, + libxl__remus_device *dev, + void func(libxl__remus_device *), + libxl__ev_child_callback callback) +{ + int pid = -1, rc; + libxl__ao_device *aodev = &dev->aodev; + STATE_AO_GC(dev->rds->ao); + + /* Fork and call */ + pid = libxl__ev_child_fork(gc, &aodev->child, callback); + if (pid == -1) { + LOG(ERROR, "unable to fork"); + rc = ERROR_FAIL; + goto out; + } + + if (!pid) { + /* child */ + func(dev); + /* notreached */ + abort(); + } + + return; + +out: + aodev->rc = rc; + aodev->callback(egc, aodev); +} + +/*----- match(), setup() and teardown() -----*/ + +/* callbacks */ +static void match_async_exec_cb(libxl__egc *egc, + libxl__async_exec_state *aes, + int status); + +/* implementations */ + +static void match_async_exec(libxl__egc *egc, libxl__remus_device *dev); + +static void drbd_setup(libxl__egc *egc, libxl__remus_device *dev) +{ + STATE_AO_GC(dev->rds->ao); + + match_async_exec(egc, dev); +} + +static void match_async_exec(libxl__egc *egc, libxl__remus_device *dev) +{ + int arraysize, nr = 0, rc; + const libxl_device_disk *disk = dev->backend_dev; + libxl__async_exec_state *aes = &dev->aodev.aes; + STATE_AO_GC(dev->rds->ao); + + /* setup env & args */ + arraysize = 1; + GCNEW_ARRAY(aes->env, arraysize); + aes->env[nr++] = NULL; + assert(nr <= arraysize); + + 
arraysize = 3; + nr = 0; + GCNEW_ARRAY(aes->args, arraysize); + aes->args[nr++] = dev->rds->drbd_probe_script; + aes->args[nr++] = disk->pdev_path; + aes->args[nr++] = NULL; + assert(nr <= arraysize); + + aes->ao = dev->rds->ao; + aes->what = GCSPRINTF("%s %s", aes->args[0], aes->args[1]); + aes->timeout_ms = LIBXL_HOTPLUG_TIMEOUT * 1000; + aes->callback = match_async_exec_cb; + aes->stdfds[0] = -1; + aes->stdfds[1] = -1; + aes->stdfds[2] = -1; + + rc = libxl__async_exec_start(gc, aes); + if (rc) + goto out; + + return; + +out: + dev->aodev.rc = rc; + dev->aodev.callback(egc, &dev->aodev); +} + +static void match_async_exec_cb(libxl__egc *egc, + libxl__async_exec_state *aes, + int status) +{ + int rc; + libxl__ao_device *aodev = CONTAINER_OF(aes, *aodev, aes); + libxl__remus_device *dev = CONTAINER_OF(aodev, *dev, aodev); + libxl__remus_drbd_disk *drbd_disk; + const libxl_device_disk *disk = dev->backend_dev; + + STATE_AO_GC(aodev->ao); + + if (status) { + rc = ERROR_REMUS_DEVOPS_DOES_NOT_MATCH; + goto out; + } + + /* ops matched */ + dev->matched = true; + + GCNEW(drbd_disk); + dev->concrete_data = drbd_disk; + drbd_disk->ackwait = 0; + drbd_disk->ctl_fd = open(disk->pdev_path, O_RDONLY); + if (drbd_disk->ctl_fd < 0) { + rc = ERROR_FAIL; + goto out; + } + + rc = 0; + +out: + aodev->rc = rc; + aodev->callback(egc, aodev); +} + +static void drbd_teardown(libxl__egc *egc, libxl__remus_device *dev) +{ + libxl__remus_drbd_disk *drbd_disk = dev->concrete_data; + STATE_AO_GC(dev->rds->ao); + + close(drbd_disk->ctl_fd); + dev->aodev.rc = 0; + dev->aodev.callback(egc, &dev->aodev); +} + +/*----- checkpointing APIs -----*/ + +/* callbacks */ +static void checkpoint_async_call_done(libxl__egc *egc, + libxl__ev_child *child, + pid_t pid, int status); + +/* API implementations */ + +/* this op will not wait and block, so implement as sync op */ +static void drbd_postsuspend(libxl__egc *egc, libxl__remus_device *dev) +{ + STATE_AO_GC(dev->rds->ao); + + libxl__remus_drbd_disk 
*rdd = dev->concrete_data; + + if (!rdd->ackwait) { + if (ioctl(rdd->ctl_fd, DRBD_SEND_CHECKPOINT, 0) <= 0) + rdd->ackwait = 1; + } + + dev->aodev.rc = 0; + dev->aodev.callback(egc, &dev->aodev); +} + + +static void drbd_preresume_async(libxl__remus_device *dev); + +static void drbd_preresume(libxl__egc *egc, libxl__remus_device *dev) +{ + STATE_AO_GC(dev->rds->ao); + + drbd_async_call(egc, dev, drbd_preresume_async, checkpoint_async_call_done); +} + +static void drbd_preresume_async(libxl__remus_device *dev) +{ + libxl__remus_drbd_disk *rdd = dev->concrete_data; + int ackwait = rdd->ackwait; + + if (ackwait) { + ioctl(rdd->ctl_fd, DRBD_WAIT_CHECKPOINT_ACK, 0); + ackwait = 0; + } + + _exit(ackwait); +} + +static void checkpoint_async_call_done(libxl__egc *egc, + libxl__ev_child *child, + pid_t pid, int status) +{ + int rc; + libxl__ao_device *aodev = CONTAINER_OF(child, *aodev, child); + libxl__remus_device *dev = CONTAINER_OF(aodev, *dev, aodev); + libxl__remus_drbd_disk *rdd = dev->concrete_data; + + STATE_AO_GC(aodev->ao); + + if (!WIFEXITED(status)) { + rc = ERROR_FAIL; + goto out; + } + + rdd->ackwait = WEXITSTATUS(status); + rc = 0; + +out: + aodev->rc = rc; + aodev->callback(egc, aodev); +} + +const libxl__remus_device_instance_ops remus_device_drbd_disk = { + .kind = LIBXL__DEVICE_KIND_VBD, + .setup = drbd_setup, + .teardown = drbd_teardown, + .postsuspend = drbd_postsuspend, + .preresume = drbd_preresume, +}; -- 2.30.2